library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(datarium)
library(ggplot2)
library(ggpubr)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(dplyr)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(mlbench)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the data
calihousing <- read_csv("housing.csv")
## Rows: 20640 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ocean_proximity
## dbl (9): longitude, latitude, housing_median_age, total_rooms, total_bedroom...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
housing <- as.data.frame(calihousing)
housing1 <- housing[, -which(names(housing) == "ocean_proximity" )]
housing2 <- housing1[, -which(names(housing) == "total_bedrooms" )]
dim(housing2)
## [1] 20640 8
head(housing2)
## longitude latitude housing_median_age total_rooms population households
## 1 -122.23 37.88 41 880 322 126
## 2 -122.22 37.86 21 7099 2401 1138
## 3 -122.24 37.85 52 1467 496 177
## 4 -122.25 37.85 52 1274 558 219
## 5 -122.25 37.85 52 1627 565 259
## 6 -122.25 37.85 52 919 413 193
## median_income median_house_value
## 1 8.3252 452600
## 2 8.3014 358500
## 3 7.2574 352100
## 4 5.6431 341300
## 5 3.8462 342200
## 6 4.0368 269700
# the first step before to build a model
cor(housing2)
## longitude latitude housing_median_age total_rooms
## longitude 1.00000000 -0.92466443 -0.10819681 0.04456798
## latitude -0.92466443 1.00000000 0.01117267 -0.03609960
## housing_median_age -0.10819681 0.01117267 1.00000000 -0.36126220
## total_rooms 0.04456798 -0.03609960 -0.36126220 1.00000000
## population 0.09977322 -0.10878475 -0.29624424 0.85712597
## households 0.05531009 -0.07103543 -0.30291601 0.91848449
## median_income -0.01517587 -0.07980913 -0.11903399 0.19804965
## median_house_value -0.04596662 -0.14416028 0.10562341 0.13415311
## population households median_income median_house_value
## longitude 0.099773223 0.05531009 -0.015175865 -0.04596662
## latitude -0.108784747 -0.07103543 -0.079809127 -0.14416028
## housing_median_age -0.296244240 -0.30291601 -0.119033990 0.10562341
## total_rooms 0.857125973 0.91848449 0.198049645 0.13415311
## population 1.000000000 0.90722227 0.004834346 -0.02464968
## households 0.907222266 1.00000000 0.013033052 0.06584265
## median_income 0.004834346 0.01303305 1.000000000 0.68807521
## median_house_value -0.024649679 0.06584265 0.688075208 1.00000000
str(housing2)
## 'data.frame': 20640 obs. of 8 variables:
## $ longitude : num -122 -122 -122 -122 -122 ...
## $ latitude : num 37.9 37.9 37.9 37.9 37.9 ...
## $ housing_median_age: num 41 21 52 52 52 52 52 52 42 52 ...
## $ total_rooms : num 880 7099 1467 1274 1627 ...
## $ population : num 322 2401 496 558 565 ...
## $ households : num 126 1138 177 219 259 ...
## $ median_income : num 8.33 8.3 7.26 5.64 3.85 ...
## $ median_house_value: num 452600 358500 352100 341300 342200 ...
summary(housing2)
## longitude latitude housing_median_age total_rooms
## Min. :-124.3 Min. :32.54 Min. : 1.00 Min. : 2
## 1st Qu.:-121.8 1st Qu.:33.93 1st Qu.:18.00 1st Qu.: 1448
## Median :-118.5 Median :34.26 Median :29.00 Median : 2127
## Mean :-119.6 Mean :35.63 Mean :28.64 Mean : 2636
## 3rd Qu.:-118.0 3rd Qu.:37.71 3rd Qu.:37.00 3rd Qu.: 3148
## Max. :-114.3 Max. :41.95 Max. :52.00 Max. :39320
## population households median_income median_house_value
## Min. : 3 Min. : 1.0 Min. : 0.4999 Min. : 14999
## 1st Qu.: 787 1st Qu.: 280.0 1st Qu.: 2.5634 1st Qu.:119600
## Median : 1166 Median : 409.0 Median : 3.5348 Median :179700
## Mean : 1425 Mean : 499.5 Mean : 3.8707 Mean :206856
## 3rd Qu.: 1725 3rd Qu.: 605.0 3rd Qu.: 4.7432 3rd Qu.:264725
## Max. :35682 Max. :6082.0 Max. :15.0001 Max. :500001
# descriptive statistics
des <- stat.desc(housing2)
round(des, 3)
## longitude latitude housing_median_age total_rooms
## nbr.val 20640.000 20640.000 20640.000 20640.000
## nbr.null 0.000 0.000 0.000 0.000
## nbr.na 0.000 0.000 0.000 0.000
## min -124.350 32.540 1.000 2.000
## max -114.310 41.950 52.000 39320.000
## range 10.040 9.410 51.000 39318.000
## sum -2467918.700 735441.620 591119.000 54402150.000
## median -118.490 34.260 29.000 2127.000
## mean -119.570 35.632 28.639 2635.763
## SE.mean 0.014 0.015 0.088 15.185
## CI.mean.0.95 0.027 0.029 0.172 29.764
## var 4.014 4.562 158.396 4759445.106
## std.dev 2.004 2.136 12.586 2181.615
## coef.var -0.017 0.060 0.439 0.828
## population households median_income median_house_value
## nbr.val 20640.000 2.06400e+04 20640.000 2.064000e+04
## nbr.null 0.000 0.00000e+00 0.000 0.000000e+00
## nbr.na 0.000 0.00000e+00 0.000 0.000000e+00
## min 3.000 1.00000e+00 0.500 1.499900e+04
## max 35682.000 6.08200e+03 15.000 5.000010e+05
## range 35679.000 6.08100e+03 14.500 4.850020e+05
## sum 29421840.000 1.03105e+07 79890.650 4.269504e+09
## median 1166.000 4.09000e+02 3.535 1.797000e+05
## mean 1425.477 4.99540e+02 3.871 2.068558e+05
## SE.mean 7.883 2.66100e+00 0.013 8.032200e+02
## CI.mean.0.95 15.450 5.21600e+00 0.026 1.574374e+03
## var 1282470.457 1.46176e+05 3.609 1.331615e+10
## std.dev 1132.462 3.82330e+02 1.900 1.153956e+05
## coef.var 0.794 7.65000e-01 0.491 5.580000e-01
# try to understand the relationship of features with house value
gg1=ggplot(housing2, aes(median_income, median_house_value) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ x) +
ggtitle("Simple linear regression model") +
xlab("Median income") + ylab("Average Price of the houses")
gg2=ggplot(housing2, aes(housing_median_age, median_house_value) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ x)+
ggtitle("Simple linear regression model") +
xlab("Average age of the houses") + ylab("Average Price of the houses")
gg3=ggplot(housing2, aes(total_rooms, median_house_value) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ x)+
ggtitle("Simple linear regression model") +
xlab("Number of bedrooms") + ylab("Average Price of the houses")
figure <- ggarrange(gg1,gg2,gg3,
labels = c("A", "B", "C"),
ncol = 2, nrow = 2)
figure # the relationships seem quite linear
# Build the MLR model
mod <- lm(median_house_value ~ housing_median_age + median_income + total_rooms , data = housing2 )
# Check the distribution of residuals
hist(mod$residuals) # Histogram of residuals
qqnorm(mod$residuals) # Q-Q plot of residuals# Shapiro-Wilk test for normality of residuals
# Build the Log transformation model to fix the normality of residual
mod_log <- lm(log(median_house_value) ~ housing_median_age + median_income + total_rooms, data = housing2)
# Check the distribution of residuals again
hist(mod_log$residuals) # Histogram of residuals
qqnorm(mod_log$residuals) # Q-Q plot of residuals
# Diagnostic plots for heteroscedasticity
# Residual vs. Fitted Values plot
plot(mod, which = 1)
# Scale-Location plot (Square root of standardized residuals vs. Fitted Values)
plot(mod, which = 3)
# Residuals vs. Leverage plot
plot(mod, which = 5)
# Apply weighted least squares regression to fic heteroscedasticity
weights <- 1/sqrt(abs(mod$residuals))
mod_wls <- lm(median_house_value ~ housing_median_age + median_income + total_rooms, data = housing2, weights = weights)
# test if it works or not
lmtest::bptest(mod_wls)
##
## studentized Breusch-Pagan test
##
## data: mod_wls
## BP = 4.9739, df = 3, p-value = 0.1737
# Build the Log model
mod2 <- glm(median_house_value ~ ocean_proximity, data = housing)
# Summarize the model
summary(mod)
##
## Call:
## lm(formula = median_house_value ~ housing_median_age + median_income +
## total_rooms, data = housing2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -584934 -53506 -15123 36448 450986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.433e+04 2.160e+03 -11.26 <2e-16 ***
## housing_median_age 1.975e+03 4.780e+01 41.32 <2e-16 ***
## median_income 4.247e+04 3.012e+02 140.98 <2e-16 ***
## total_rooms 3.888e+00 2.793e-01 13.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80480 on 20636 degrees of freedom
## Multiple R-squared: 0.5137, Adjusted R-squared: 0.5136
## F-statistic: 7266 on 3 and 20636 DF, p-value: < 2.2e-16
summary(mod2)
##
## Call:
## glm(formula = median_house_value ~ ocean_proximity, data = housing)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -236712 -66247 -21005 42273 375196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 240084 1054 227.804 < 2e-16 ***
## ocean_proximityINLAND -115279 1631 -70.686 < 2e-16 ***
## ocean_proximityISLAND 140356 45062 3.115 0.00184 **
## ocean_proximityNEAR BAY 19128 2354 8.125 4.71e-16 ***
## ocean_proximityNEAR OCEAN 9350 2220 4.212 2.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 10147556380)
##
## Null deviance: 2.7483e+14 on 20639 degrees of freedom
## Residual deviance: 2.0939e+14 on 20635 degrees of freedom
## AIC: 534137
##
## Number of Fisher Scoring iterations: 2
summary(mod_wls)
##
## Call:
## lm(formula = median_house_value ~ housing_median_age + median_income +
## total_rooms, data = housing2, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -21318 -3251 -995 2906 17634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.983e+04 1.452e+03 -20.54 <2e-16 ***
## housing_median_age 1.930e+03 3.258e+01 59.25 <2e-16 ***
## median_income 4.330e+04 2.084e+02 207.72 <2e-16 ***
## total_rooms 3.733e+00 1.987e-01 18.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4299 on 20636 degrees of freedom
## Multiple R-squared: 0.6987, Adjusted R-squared: 0.6987
## F-statistic: 1.595e+04 on 3 and 20636 DF, p-value: < 2.2e-16
# Set the random seed for reproducibility
set.seed(123)
# Subset the dataset for clustering
cluster_data <- housing[, c("total_rooms", "households")]
# Standardize the numerical variables
scaled_data <- scale(cluster_data[, c("total_rooms", "households")])
# Calculate Euclidean distance and perform hierarchical clustering
dd <- dist(scaled_data, method = "euclidean")
clusters <- hclust(dd, method = "complete")
# Determine the optimal number of clusters using the elbow method
wss <- numeric(10)
for (i in 1:10) {
kmeans_model <- kmeans(scaled_data, centers = i)
wss[i] <- kmeans_model$tot.withinss
}
# Plot the elbow curve
plot(1:10, wss, type = "b", pch = 19, frame = FALSE, xlab = "Number of Clusters",
ylab = "Within-Cluster Sum of Squares", ylim = c(min(wss, na.rm = TRUE), max(wss, na.rm = TRUE)))
# Determine the optimal number of clusters visually
k <- 3 # Set the desired number of clusters
# Perform clustering with the optimal number of clusters
cut <- cutree(clusters, k)
# Print the count of each ocean proximity category within each cluster
table(cut, housing$ocean_proximity)
##
## cut <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
## 1 8898 6316 5 2241 2610
## 2 227 232 0 49 45
## 3 11 3 0 0 3
# Visualize the real categories of the data points
p = ggplot(cluster_data, aes(total_rooms, households))
p + geom_point(aes(colour = factor(housing$ocean_proximity)), size = 4) + ggtitle("Real Categories")
# Visualize the clustering results
p = ggplot(cluster_data, aes(total_rooms, households))
p + geom_point(aes(colour = factor(cut)), size = 4) + ggtitle("Clustering Results")
# Visualize the clustering results with a legend
fviz_cluster(list(data = cluster_data, cluster = cut), stand = FALSE, addlegend = "bottom")
# Remove missing values
housing <- housing[complete.cases(housing), ]
# Supervised Classification - Predicting Housing Category
# Convert the housing category to a factor variable
housing$housing_median_age_category <- as.factor(ifelse(housing$housing_median_age > median(housing$housing_median_age), "old", "new"))
# Split the data into training and testing sets
set.seed(123)
train_indices <- sample(1:nrow(housing), 0.8 * nrow(housing))
train_data <- housing[train_indices, ]
test_data <- housing[-train_indices, ]
# Train a classification model (e.g., random forest)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
model <- randomForest(housing_median_age_category ~ ., data = train_data)
# Make predictions on the test data
predictions <- predict(model, newdata = test_data)
# Convert factor levels to match between predicted and actual values
test_data$housing_median_age_category <- factor(test_data$housing_median_age_category,
levels = levels(train_data$housing_median_age_category))
# Evaluate the model and generate confusion matrix
confusionMatrix(predictions, test_data$housing_median_age_category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction new old
## new 2077 0
## old 0 2010
##
## Accuracy : 1
## 95% CI : (0.9991, 1)
## No Information Rate : 0.5082
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5082
## Detection Rate : 0.5082
## Detection Prevalence : 0.5082
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : new
##